from bertopic import BERTopic
# Restore the previously saved BERTopic model from disk (the path is specific to this machine).
topic_model = BERTopic.load("/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_country_expand_model")
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
import pandas as pd
# Load the corpus spreadsheet and pull out the two columns used below:
# "docs" (the document texts) and "Year" (one timestamp per document).
df = pd.read_excel("/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time_country_expand.xlsx")
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()

# Topic ids known to the loaded model.
topic_model.get_topic_info()["Topic"]

# Per-document topic assignment, restricted to the columns of interest.
topic_model.get_document_info(docs)[
    ["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]
]
| | Topic | Name | Top_n_words | Probability | Representative_document |
|---|---|---|---|---|---|
| 0 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.788525 | False |
| 1 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.342139 | False |
| 2 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.503221 | False |
| 3 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.100914 | False |
| 4 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.649000 | False |
| ... | ... | ... | ... | ... | ... |
| 68334 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.556607 | False |
| 68335 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.827829 | False |
| 68336 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.790249 | False |
| 68337 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.518765 | False |
| 68338 | -1 | -1_energy_development_management_project | energy - development - management - project - ... | 0.557813 | False |
68339 rows × 5 columns
# Print the 100 most frequent whitespace-separated tokens across all documents.
from collections import Counter

# Counter is a dict subclass, so `counts` can still be used like the plain dict it replaces.
counts = Counter(word for doc in docs for word in doc.split())

# most_common() returns (word, count) pairs ordered by descending count; words with equal
# counts keep first-seen order, matching the stable reverse sort used previously.
items = counts.most_common()

# Slice instead of indexing items[0..99] so a corpus with fewer than 100 distinct
# tokens prints what it has rather than raising IndexError.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
energy 36106 land 26216 development23921 plan 16674 forest 16482 national 16031 establish 15315 environmental15090 management14829 agricultural14231 activity 13924 public 13387 purpose 12978 article 12923 policy 12503 protection12038 resource 11674 production11124 measure 11046 project 10876 system 10747 emission 10632 sector 10620 provide 10296 procedure 9784 power 9776 objective 9773 environment 9730 product 9677 promote 9493 set 9430 water 9235 control 9002 gas 8972 include 8909 condition 8700 natural 8678 support 8539 regulation 8268 consist 8167 sustainable 8086 economic 8072 implementation 7909 efficiency 7882 service 7862 aim 7807 requirement 7789 renewable 7406 application 7261 country 7211 electricity 7184 ensure 7145 agreement 7117 organization 6941 rule 6866 standard 6865 implement 6717 government 6717 relate 6699 carry 6595 regulate 6522 develop 6519 framework 6369 action 6365 legal 6238 grant 6229 operation 6224 establishes 6187 plant 6125 authority 6050 program 5975 increase 5928 strategy 5905 level 5888 market 5833 rural 5755 function 5743 protect 5738 building 5695 person 5630 conservation 5606 process 5556 `` 5525 create 5488 term 5463 international 5460 minister 5430 source 5393 improve 5308 right 5287 investment 5258 local 5256 fuel 5172 climate 5167 quality 5142 waste 5124 tax 5104 reduce 5096 technical 5029 issue 4997
# Ask the model for the five topics most similar to the query "Transport",
# then show the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics(search_term="Transport", top_n=5)
topic_model.get_topic(topic=similar_topics[0])
[('energy', 0.03230173130614682),
('adverse', 0.013766813290960673),
('modification', 0.013384015874985505),
('environment', 0.012575852856891081),
('hfc', 0.011520532554952024),
('assessment', 0.01110828408978698),
('ozone', 0.011048527597892805),
('transboundary', 0.010538213919751344),
('objective', 0.010021317175811909),
('tariff', 0.009750261281022469)]
# Ask the model for the five topics most similar to the query "Industry",
# then show the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics(search_term="Industry", top_n=5)
topic_model.get_topic(topic=similar_topics[0])
[('architectural', 0.0240691542320956),
('territory', 0.022780009435618136),
('county', 0.02132256745756662),
('municipal', 0.02049388887075923),
('neighbourhood', 0.018203884240013932),
('village', 0.016495837073308834),
('architecture', 0.016006323381752823),
('territorial', 0.013546296462238094),
('district', 0.013263537723284712),
('municipality', 0.012657715219287767)]
# Ask the model for the five topics most similar to the query "Energy systems",
# then show the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics(search_term="Energy systems", top_n=5)
topic_model.get_topic(topic=similar_topics[0])
[('biodiversity', 0.17937641584550487),
('biological', 0.050332011052528866),
('sectoral', 0.022775396263003653),
('strategic', 0.02205042804451),
('conserve', 0.019934529526093942),
('specie', 0.01947446686530582),
('management', 0.019097955609743363),
('genetic', 0.017997541413598488),
('equitable', 0.01706068145479469),
('objective', 0.01614543846081185)]
# Ask the model for the five topics most similar to the query "Buildings",
# then show the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics(search_term="Buildings", top_n=5)
topic_model.get_topic(topic=similar_topics[0])
[('architectural', 0.0240691542320956),
('territory', 0.022780009435618136),
('county', 0.02132256745756662),
('municipal', 0.02049388887075923),
('neighbourhood', 0.018203884240013932),
('village', 0.016495837073308834),
('architecture', 0.016006323381752823),
('territorial', 0.013546296462238094),
('district', 0.013263537723284712),
('municipality', 0.012657715219287767)]
# Ask the model for the five topics most similar to the query "AFOLU",
# then show the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics(search_term="AFOLU", top_n=5)
topic_model.get_topic(topic=similar_topics[0])
[('phytosanitary', 0.12690569974001198),
('cadmium', 0.041165131035834794),
('dalbergia', 0.041088957935167134),
('nickel', 0.03952115331890425),
('biocidal', 0.038387871497668556),
('polycyclic', 0.03649408337469938),
('eucalyptus', 0.03169599469361471),
('ambient', 0.027820439480866342),
('dariniensis', 0.022501541167033873),
('establishes', 0.01995691635840361)]
len(docs)
68339
import os
# Output directory for exported figures.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
# exist_ok=True creates the directory (including missing parents) when it is absent and does
# nothing when it already exists, without the gap between a separate existence check and
# the create call.
os.makedirs(images_path, exist_ok=True)
import plotly.io as pio
# Make SVG the default format for static image export through kaleido.
pio.kaleido.scope.default_format = "svg"
# pio.kaleido.scope.mathjax = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js"

# Word-score bar charts: 40 topics, 10 words per topic.
fig = topic_model.visualize_barchart(top_n_topics=40, n_words=10, width=300, height=300)
fig.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_barchart.svg')
# fig = topic_model.visualize_barchart(top_n_topics = 20, n_words=10, width = 300, height= 300)
# fig.write_html("/home/zhhuang/climate_policy_paper/paper_images/topic_barchart.png", engine="kaleido")
# img_bytes = fig.to_image(format="png", width=600, height=350, scale=2)
# Image(img_bytes)

# Trailing expression so the notebook renders the figure inline.
fig
# topic_model.visualize_barchart(top_n_topics = 20, n_words=10,width = 300, height= 300)
# Topic heatmap, saved as SVG and then shown inline.
fig2 = topic_model.visualize_heatmap()
# fig = topic_model.visualize_barchart(top_n_topics = 20, n_words=10, width = 300, height= 300)
fig2.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_heatmap.svg')
fig2
# Topic overview plot from visualize_topics(), saved as SVG and then shown inline.
fig3 = topic_model.visualize_topics()
fig3.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_visualize_topics.svg')
fig3
# Compute the topic hierarchy for the corpus. This is slow: the recorded run took
# about 20 minutes for 58 steps.
hierarchical_topics = topic_model.hierarchical_topics(docs)
# print(hierarchical_topics)

# Save the result so it can be reloaded without recomputing. The workbook is written to the
# same absolute location the reload step reads from; a bare file name would land in whatever
# the current working directory happens to be, and the reload could then fail or pick up a
# stale file.
hierarchical_topics_path = "/home/zhhuang/climate_policy_paper/code/Topic_country_expand_hierarchical_topics.xlsx"
# strings_to_urls=False asks xlsxwriter to keep URL-like text as plain strings.
with pd.ExcelWriter(hierarchical_topics_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    hierarchical_topics.to_excel(writer)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [20:37<00:00, 21.34s/it]
# Reload the saved hierarchy table instead of recomputing it, then plot and export it.
# NOTE(review): the sheet was saved with the DataFrame index, and any list-valued cells
# would come back from Excel as plain strings - confirm visualize_hierarchy accepts the
# reloaded frame as-is.
hierarchical_topics = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_country_expand_hierarchical_topics.xlsx")
fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig4.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_hierarchical_topics.svg')
fig4
# Turn every year into the string form that topics_over_time parses with "%Y".
# A year of zero (presumably a placeholder for a missing date - confirm against the data)
# is mapped to '2020'.
# The comparison uses the string form so the placeholder is caught whether the spreadsheet
# delivered it as the number 0 or the text '0'; comparing the raw value with '0' only
# matched the text form and let a numeric 0 through as the year '0'.
# Slice assignment updates the existing list in place, as the element-wise loop did.
timestamp[:] = ["2020" if str(year) == "0" else str(year) for year in timestamp]
# Compute topic representations over time, parsing timestamps as four-digit years and
# grouping them into 20 bins. This is slow: the recorded run took about 2.5 hours.
topics_over_time = topic_model.topics_over_time(docs, timestamp, datetime_format="%Y", nr_bins=20)

# Save the result so it can be reloaded without recomputing. The workbook is written to the
# same absolute location the reload step reads from; a bare file name would land in whatever
# the current working directory happens to be, and the reload could then fail or pick up a
# stale file.
topics_over_time_path = "/home/zhhuang/climate_policy_paper/code/Topic_country_expand_topics_over_time.xlsx"
# strings_to_urls=False asks xlsxwriter to keep URL-like text as plain strings.
with pd.ExcelWriter(topics_over_time_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    topics_over_time.to_excel(writer)
19it [2:29:15, 471.35s/it]
# Reload the saved topics-over-time table instead of recomputing it, then plot and export it.
# NOTE(review): the sheet was saved with the DataFrame index, so the reloaded frame carries
# one extra leading column - confirm the plot ignores it.
topics_over_time = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_country_expand_topics_over_time.xlsx")
fig5 = topic_model.visualize_topics_over_time(topics_over_time)
fig5.write_image('/home/zhhuang/climate_policy_paper/paper_images/topic_country_expand_visualize_topics_over_time.svg')
fig5